########################gene annotation set-based gene prioritization method########################

library(data.table)
library(tidyverse)
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)



##Method 1: Evaluate the p-values of gold standard genes and candidate genes based on attributes########

####1.0 Wrapped as a function: the attribute-based method only needs two data frames as input####
#train_attribution: "attribution": attribute column "score": score column (mostly using p-value)
#test_annotations: "ENTREZID": gene entrez_id "SYMBOL": gene symbol name "attribution": attribute annotated to the gene
# Score candidate genes against a set of scored attributes.
#
# For every gene in `test_annotations`, the scores (p-values) of all
# attributes annotated to that gene that also occur in `train_attribution`
# are merged into one p-value with Fisher's combined probability test
# (S = -2 * sum(log(p)), chi-square with 2k degrees of freedom).
# A gene without any matching attribute gets a combined p-value of 1.
#
# Arguments:
#   train_attribution  data frame with columns "attribution" (attribute id)
#                      and "score" (p-value of that attribute).
#   test_annotations   data frame with columns "ENTREZID", "SYMBOL" and
#                      "attribution"; one row per gene/attribute pair.
#
# Returns:
#   A data frame built by subsetting `test_annotations` (so it keeps that
#   object's data-frame class) with the columns SYMBOL, ENTREZID (character)
#   and score, sorted by ascending score. Rows whose ENTREZID is NA are
#   ignored. An input with zero rows yields a zero-row result.
fun.base.attribution <- function(train_attribution, test_annotations) {
  stopifnot(
    all(c("attribution", "score") %in% names(train_attribution)),
    all(c("ENTREZID", "SYMBOL", "attribution") %in% names(test_annotations))
  )

  # `[[` always yields a plain vector; `[rows, "col"]` yields a vector for a
  # base data.frame but a one-column table for a tibble/data.table, which
  # silently breaks length()-based logic further down.
  train_terms <- train_attribution[["attribution"]]
  train_scores <- train_attribution[["score"]]
  gene_ids <- as.character(test_annotations[["ENTREZID"]])

  # Fisher's omnibus procedure over the scores of the attributes of one gene.
  combine_p_values <- function(gene_terms) {
    matched <- train_scores[train_terms %in% gene_terms]
    if (length(matched) == 0) {
      return(1) # no matching attribute: least significant p-value
    }
    fisher_stat <- -2 * sum(log(matched))
    pchisq(fisher_stat, df = 2 * length(matched), lower.tail = FALSE)
  }

  # One pass over the annotations instead of one table scan per gene.
  # factor() drops NA ids; the explicit levels keep first-appearance order.
  terms_by_gene <- split(
    test_annotations[["attribution"]],
    factor(gene_ids, levels = unique(gene_ids))
  )
  # vapply() keeps a numeric result even when there are no genes at all.
  gene_scores <- vapply(terms_by_gene, combine_p_values, numeric(1))

  # Unique SYMBOL/ENTREZID pairs with the combined score attached by name,
  # so the result does not depend on which package's rename()/join is
  # currently on the search path.
  gene_table <- test_annotations[, c("SYMBOL", "ENTREZID")]
  gene_table$ENTREZID <- gene_ids
  gene_table <- unique(gene_table[!is.na(gene_ids), ])
  gene_table$score <- unname(gene_scores[gene_table$ENTREZID])

  gene_table[order(gene_table$score), ]
}

# Sensitivity analysis: in each round one gene (the "defector") is held out of
# the gold-standard training list, mixed with 99 randomly drawn other genes,
# and the candidates are re-prioritised from MSigDB attributes. Column 1 of
# `result_sensit` stores the held-out symbol, column 2 whether its integrated
# rank is below 10 (NA when the gene does not appear in the ranking at all).
# NOTE(review): sampling is not seeded, so runs are not reproducible.

# Used to integrate the per-category scores by order statistics.
library(OS.JCDF)

n_rounds <- 100
result_sensit <- data.frame(matrix(nrow = n_rounds, ncol = 2))

## Read the gold standard gene set
# Loop-invariant inputs are loaded once instead of once per round.
# file.path() replaces the Windows-only ".\\" prefix so the script also runs
# on other platforms.
all_result <- fread(file.path(".", "RA_gene_gold_matrix.txt"), sep = "\t") %>%
  mutate(order = rank(P.value, na.last = TRUE))
symbol2entrzid <- data.frame(
  Symbol = all_result$Symbol,
  entrez_id = all_result$entrez_id
)

####1.1 Properties in the MSigDB database####
msigdb_term <- msigdbr(species = "Homo sapiens")
msigdb_categories <- unique(msigdb_term$gs_cat)

for (i in seq_len(n_rounds)) {
  # (1) Gold standard gene set: only symbol and entrez id
  # NOTE(review): this draws from the first 100 rows in file order, which is
  # the top 100 genes only if the file is already sorted by P.value - confirm.
  defector <- sample(all_result$Symbol[1:100], 1)

  print(str_c(i, "--", defector))
  result_sensit[i, 1] <- defector

  is_training_gene <- all_result$P.value <= 0.05 & all_result$Symbol != defector
  train.list <- all_result[which(is_training_gene), ] %>%
    dplyr::select(Symbol, entrez_id)

  # (2) Candidate genes: the defector plus 99 randomly selected other genes
  other_symbols <- all_result$Symbol[which(all_result$Symbol != defector)]
  test.list <- left_join(
    data.frame(Symbol = c(defector, sample(other_symbols, 99))),
    symbol2entrzid,
    by = "Symbol"
  )

  genes <- train.list$entrez_id
  base.attribution.result <- list()

  for (cate in msigdb_categories) {
    # `!!cate` injects the loop value; the original compared against
    # sym(cate), which only worked because `==` deparses symbols to strings.
    term <- msigdb_term %>% dplyr::filter(gs_cat == !!cate)

    # Attribute scores: enrichment p-values of the training genes.
    enricher_result <- enricher(
      gene = genes,
      TERM2GENE = dplyr::select(term, gs_name, entrez_gene)
    )
    if (is.null(enricher_result)) {
      next # no result object for this category: nothing to score against
    }
    train_attribution <- enricher_result@result %>%
      dplyr::select("ID", "pvalue") %>%
      dplyr::rename("attribution" = "ID", "score" = "pvalue")

    # Attributes annotated to the candidate genes in this category.
    test_annotations <- term %>%
      dplyr::filter(entrez_gene %in% test.list$entrez_id) %>%
      dplyr::select("gs_name", "gene_symbol", "entrez_gene") %>%
      dplyr::rename(
        "ENTREZID" = "entrez_gene",
        "SYMBOL" = "gene_symbol",
        "attribution" = "gs_name"
      )
    if (nrow(test_annotations) == 0) {
      next # no candidate gene is annotated in this category
    }

    test.enrich <- fun.base.attribution(train_attribution, test_annotations)
    test.enrich$category <- cate
    test.enrich <- dplyr::rename(test.enrich, "Symbol" = "SYMBOL")
    base.attribution.result[[cate]] <- test.enrich
  }

  # One row per gene, one score column per MSigDB category.
  base.attribution.result1 <- bind_rows(base.attribution.result)
  base.attribution.result <- pivot_wider(
    base.attribution.result1,
    names_from = category,
    values_from = score
  )
  # Plain assignment instead of `%<>%`, which tidyverse does not provide.
  base.attribution.result <- dplyr::rename(
    base.attribution.result,
    "symbol" = "Symbol"
  )

  # Use order statistics to get the integration order
  # NOTE(review): gene.colname names "entrez_id" and statistics.colname names
  # "P.Value", while the frame built above carries "ENTREZID" and one column
  # per category - confirm against the OS.JCDF documentation.
  base.attrib.test <- get.input.form(
    datapath = base.attribution.result,
    evidence.matrix = TRUE,
    gene.colname = c("symbol", "entrez_id"),
    statistics.colname = "P.Value",
    form = "order.rate",
    add.entrezid = TRUE
  )
  test_result <- os.jcdf(base.attrib.test, ncol(base.attrib.test))
  test_result <- test_result[[1]] %>%
    as.data.frame() %>%
    left_join(
      data.frame("symbol" = all_result$Symbol, "order" = all_result$order),
      by = "symbol"
    )

  # rank() gives the position of each row in the ascending p-value ordering.
  # The original indexed order(), which returns the row index of the k-th
  # smallest p-value and is only correct when the table is already sorted.
  p_value_rank <- rank(test_result$P.value, ties.method = "min")
  defector_rows <- which(test_result$symbol == defector)
  result_sensit[i, 2] <- if (length(defector_rows) == 0) {
    NA # defector was not ranked in any category
  } else {
    min(p_value_rank[defector_rows]) < 10
  }
}
